In [1]:
# Logistic Regression on Titanic Dataset

# Goal: Predict survival on the Titanic using logistic regression.

# Broad Steps to Solve the Problem:
# 1. Import necessary libraries
# 2. Load the dataset
# 3. Explore the dataset (check for missing values, outliers, etc., and how the features relate to the target variable)
# 4. Encode categorical variables (if any)
# 5. Separate the independent variables (features) into X and the dependent variable (target) into y
# 6. Split the dataset into training and testing sets
# 7. Scale the independent variables (if necessary). You will not scale the dependent variable.
# 8. Modeling - apply the Logistic Regression algorithm or any other classification algorithm of your choice
# 9. Make predictions on the test set
# 10. Evaluate the model performance using appropriate classification metrics (e.g., accuracy, precision, recall, F1-score, ROC-AUC)
# 11. Visualize the results (if necessary)
# 12. Save the model (if necessary)
# 13. Document the findings and conclusions
# 14. Share the results with stakeholders (if necessary)
# 15. Deploy the model (if necessary)
In [2]:
# Let's do the necessary imports: numerics, data handling, and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# NOTE(review): blanket suppression hides FutureWarnings and sklearn
# ConvergenceWarnings that may signal real problems — consider filtering
# specific warning categories instead of ignoring everything.
warnings.filterwarnings('ignore')
In [3]:
# Load the Titanic dataset (expects titanic.csv in the working directory)
df = pd.read_csv('titanic.csv')
# Preview the first five rows
df.head()
Out[3]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [4]:
# Dataset dimensions as (rows, columns)
df.shape
Out[4]:
(891, 12)
In [5]:
# Note:
# Our data set was already explored during our EDA class. So by default, it is expected that you explore every project before coming to any conclusion. Since we have already explored the data and we remember the findings, I will be doing minimal EDA and jumping straight into modelling.

# However, for any real-life use case you are strongly recommended to focus more on EDA to extract the relevant features and get insights from the data before proceeding to modelling.
In [6]:
# Count missing values per column, most-affected columns first
df.isna().sum().sort_values(ascending=False)
Out[6]:
Cabin          687
Age            177
Embarked         2
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
SibSp            0
Parch            0
Ticket           0
Fare             0
dtype: int64
In [7]:
# Cabin is missing for 687 of 891 rows, so it carries too little usable
# signal — drop the column entirely.
df = df.drop(columns=['Cabin'])

# Re-check the remaining missing-value counts
df.isna().sum().sort_values(ascending=False)
Out[7]:
Age            177
Embarked         2
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
SibSp            0
Parch            0
Ticket           0
Fare             0
dtype: int64
In [8]:
# Distribution of the Embarked port values (S, C, Q)
df.Embarked.value_counts()
Out[8]:
Embarked
S    644
C    168
Q     77
Name: count, dtype: int64
In [9]:
# Since most of the Embarked values are S, we will fill the missing values with S (mode imputation).
# FIX: the original used df['Embarked'].fillna('S', inplace=True) — chained
# assignment with inplace=True operates on a possibly-temporary object,
# raises FutureWarning on pandas >= 2.2, and stops working under pandas 3.0
# copy-on-write. Assign the result back explicitly instead.
df['Embarked'] = df['Embarked'].fillna('S')

# Verify: Embarked should now show zero missing values
df.isna().sum().sort_values(ascending=False)
Out[9]:
Age            177
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         0
dtype: int64
In [10]:
# Skewness of the Age distribution (0 = perfectly symmetric)
df.Age.skew()
Out[10]:
0.38910778230082704
In [11]:
# Interpreting skewness. If this value is between:
#
# ==> -0.5 and 0.5, the distribution of the values is almost symmetrical
# ==> -1 and -0.5, the data is negatively skewed, and if it is between 0.5 and 1, the data is positively skewed. The skewness is moderate.
# ==> If the skewness is lower than -1 (negatively skewed) or greater than 1 (positively skewed), the data is highly skewed.
In [12]:
# Since the skewness is between -0.5 and 0.5, the Age distribution is roughly
# symmetrical, so the mean is a reasonable imputation value.
# FIX: chained df['Age'].fillna(..., inplace=True) is deprecated (FutureWarning
# on pandas >= 2.2, broken under pandas 3.0 copy-on-write) — assign back instead.
# NOTE(review): the mean is computed over the FULL dataset before the
# train/test split, which leaks test-set information into training; ideally
# impute with the training-set mean after splitting — TODO confirm impact.
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Now let's check if there are any missing values left.
df.isna().sum().sort_values(ascending=False)
Out[12]:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64
In [13]:
# Preview the data after handling missing values
df.head()
Out[13]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S
In [14]:
# Feature Engineering

# FamilySize counts the passenger themself plus siblings/spouses (SibSp)
# and parents/children (Parch) travelling with them.
df = df.assign(FamilySize=df['SibSp'] + df['Parch'] + 1)

df.head()
Out[14]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked FamilySize
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S 2
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C 2
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S 1
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 S 2
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S 1
In [15]:
# isAlone flags passengers travelling with no family aboard:
# 1 when FamilySize == 1, otherwise 0.
df['isAlone'] = (df['FamilySize'] == 1).astype(int)

df.head()
Out[15]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked FamilySize isAlone
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S 2 0
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C 2 0
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S 1 1
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 S 2 0
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S 1 1
In [16]:
# GenderClass: treat young children as their own category.
# If Age < 15 -> 'child', otherwise the passenger's Sex ('male'/'female').
# FIX: replaced the row-wise df.apply(..., axis=1) lambda with a vectorized
# np.where — same result, computed in a single pass over the column.
df['GenderClass'] = np.where(df['Age'] < 15, 'child', df['Sex'])

df.head()
Out[16]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked FamilySize isAlone GenderClass
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S 2 0 male
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C 2 0 female
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S 1 1 female
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 S 2 0 female
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S 1 1 male
In [17]:
# Distribution of the engineered GenderClass feature
df.GenderClass.value_counts()
Out[17]:
GenderClass
male      538
female    275
child      78
Name: count, dtype: int64
In [18]:
# Title Extraction
# A passenger's title sits between the comma and the first period in the
# Name column, e.g. "Braund, Mr. Owen Harris" -> "Mr".
# FIX: replaced the per-row Python lambda with pandas vectorized .str
# methods — identical splitting semantics, one pass over the column.
df['Title'] = df['Name'].str.split(',').str[1].str.split('.').str[0].str.strip()

df.head()
Out[18]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked FamilySize isAlone GenderClass Title
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S 2 0 male Mr
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C 2 0 female Mrs
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S 1 1 female Miss
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 S 2 0 female Mrs
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S 1 1 male Mr
In [19]:
# Raw title frequencies — many rare titles need consolidating
df.Title.value_counts()
Out[19]:
Title
Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: count, dtype: int64
In [20]:
# Let's group the rare titles into four broad categories (Mr, Mrs, Miss, Master).
title_mapping = {
    'Mr': 'Mr',
    'Mrs': 'Mrs',
    'Miss': 'Miss',
    'Master': 'Master',
    'Don': 'Mr',
    'Rev': 'Mr',
    'Dr': 'Mr',
    'Mme': 'Mrs',
    'Ms': 'Miss',
    'Major': 'Mr',
    'Lady': 'Mrs',
    'Sir': 'Mr',
    'Mlle': 'Miss',
    'Col': 'Mr',
    'Capt': 'Mr',
    'the Countess': 'Mrs',
    'Jonkheer': 'Mr'
}

# FIX: .map() silently turns any title missing from the mapping into NaN,
# which would reintroduce missing values downstream. Fall back to 'Mr'
# (the majority class, and where most rare titles are bucketed) for safety;
# on this dataset every observed title is covered, so the result is unchanged.
df['Title'] = df['Title'].map(title_mapping).fillna('Mr')

# Now let's check the value counts of the grouped titles
df.Title.value_counts()
Out[20]:
Title
Mr        538
Miss      185
Mrs       128
Master     40
Name: count, dtype: int64
In [21]:
# Preview the engineered features before dropping the raw columns
df.head()
Out[21]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked FamilySize isAlone GenderClass Title
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S 2 0 male Mr
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C 2 0 female Mrs
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S 1 1 female Miss
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 S 2 0 female Mrs
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S 1 1 male Mr
In [22]:
# The identifier and raw-text columns — and the columns already folded into
# engineered features (Sex, SibSp, Parch) — are no longer needed for modelling.
columns_to_drop = ['PassengerId', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket']
df = df.drop(columns=columns_to_drop)

df.head()
Out[22]:
Survived Pclass Age Fare Embarked FamilySize isAlone GenderClass Title
0 0 3 22.0 7.2500 S 2 0 male Mr
1 1 1 38.0 71.2833 C 2 0 female Mrs
2 1 3 26.0 7.9250 S 1 1 female Miss
3 1 1 35.0 53.1000 S 2 0 female Mrs
4 0 3 35.0 8.0500 S 1 1 male Mr
In [23]:
# One-hot encoding / Dummification of columns: Embarked, GenderClass, Title.
# drop_first=True drops one level per column to avoid the dummy-variable trap
# (perfect multicollinearity); dtype=int yields 0/1 columns instead of booleans.
df = pd.get_dummies(df, columns=['Embarked', 'GenderClass', 'Title'], drop_first=True, dtype=int)
df.head()
Out[23]:
Survived Pclass Age Fare FamilySize isAlone Embarked_Q Embarked_S GenderClass_female GenderClass_male Title_Miss Title_Mr Title_Mrs
0 0 3 22.0 7.2500 2 0 0 1 0 1 0 1 0
1 1 1 38.0 71.2833 2 0 0 0 1 0 0 0 1
2 1 3 26.0 7.9250 1 1 0 1 1 0 1 0 0
3 1 1 35.0 53.1000 2 0 0 1 1 0 0 0 1
4 0 3 35.0 8.0500 1 1 0 1 0 1 0 1 0
In [24]:
# Split the data into features (X) and target variable (y)
X = df.drop('Survived', axis=1)
y = df['Survived']
In [25]:
# Train-test split: 80% train / 20% test.
# stratify=y keeps the survived/died ratio the same in both splits;
# random_state=0 makes the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

# Print the shapes of the train and test sets
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
X_train shape: (712, 12), y_train shape: (712,)
X_test shape: (179, 12), y_test shape: (179,)
In [26]:
# Preview the training features (the repeated Age 29.699118 rows are
# presumably the mean-imputed values from earlier — verify if surprising)
X_train.head()
Out[26]:
Pclass Age Fare FamilySize isAlone Embarked_Q Embarked_S GenderClass_female GenderClass_male Title_Miss Title_Mr Title_Mrs
502 3 29.699118 7.6292 1 1 1 0 1 0 1 0 0
464 3 29.699118 8.0500 1 1 0 1 0 1 0 1 0
198 3 29.699118 7.7500 1 1 1 0 1 0 1 0 0
765 1 51.000000 77.9583 2 0 0 1 1 0 0 0 1
421 3 21.000000 7.7333 1 1 1 0 0 1 0 1 0
In [27]:
# Only the continuous features need scaling; the dummy and ordinal columns
# are already on small, comparable scales.
features_to_be_scaled = ['Age', 'Fare']

# Scaling the features.
# The scaler is fit on the TRAINING data only and then applied to the test
# set, so no test-set statistics leak into training.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[features_to_be_scaled] = sc.fit_transform(X_train[features_to_be_scaled])
X_test[features_to_be_scaled] = sc.transform(X_test[features_to_be_scaled])
In [28]:
# Verify the scaled Age/Fare columns (mean ~0, std ~1 on the training set)
X_train.head()
Out[28]:
Pclass Age Fare FamilySize isAlone Embarked_Q Embarked_S GenderClass_female GenderClass_male Title_Miss Title_Mr Title_Mrs
502 3 0.014210 -0.482061 1 1 1 0 1 0 1 0 0
464 3 0.014210 -0.474085 1 1 0 1 0 1 0 1 0
198 3 0.014210 -0.479771 1 1 1 0 1 0 1 0 0
765 1 1.674826 0.850885 2 0 0 1 1 0 0 0 1
421 3 -0.663973 -0.480088 1 1 1 0 0 1 0 1 0
In [29]:
# Modeling - apply the Logistic Regression algorithm
from sklearn.linear_model import LogisticRegression
# FIX: the default max_iter=100 can leave the lbfgs solver short of
# convergence, and the blanket warnings filter at the top of the notebook
# would hide the resulting ConvergenceWarning. A higher cap is harmless when
# the solver already converges (it stops early) and prevents silently
# training a half-fitted model.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Predicting on training and test sets
y_pred_train = logreg.predict(X_train)
y_pred_test = logreg.predict(X_test)

# Evaluating the model performance: comparing train vs test accuracy shows
# whether the model over- or under-fits.
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred_test)
print(f"Training Accuracy: {train_acc}")
print(f"Testing Accuracy: {test_acc}")
Training Accuracy: 0.8328651685393258
Testing Accuracy: 0.8156424581005587
Aspect Observation
Accuracy Score Both training and testing accuracies are above 80%, which is strong baseline performance for this dataset.
No Signs of Overfitting Since training and testing scores are close, model does not overfit.
Suitable for Deployment Given the balanced accuracy and simplicity of logistic regression, this model can serve as a baseline classifier.
Potential Next Steps Consider adding more features, trying regularization (C parameter), or comparing with other classifiers (e.g., Random Forest, XGBoost) to improve performance.
In [30]:
# Confusion matrix on the test set: rows = actual (0=died, 1=survived),
# columns = predicted
confusion_matrix(y_test, y_pred_test)
Out[30]:
array([[98, 12],
       [21, 48]])
In [34]:
# Per-class precision, recall, F1-score and support on the test set
print(classification_report(y_test, y_pred_test))
              precision    recall  f1-score   support

           0       0.82      0.89      0.86       110
           1       0.80      0.70      0.74        69

    accuracy                           0.82       179
   macro avg       0.81      0.79      0.80       179
weighted avg       0.81      0.82      0.81       179

In [31]:
# Wrap the confusion matrix in a labelled DataFrame so it reads naturally.
actual_labels = ['Actually Died', 'Actually Survived']
predicted_labels = ['Predicted Died', 'Predicted Survived']
confusion_mat = confusion_matrix(y_test, y_pred_test)
cm_df = pd.DataFrame(confusion_mat, index=actual_labels, columns=predicted_labels)
cm_df
Out[31]:
Predicted Died Predicted Survived
Actually Died 98 12
Actually Survived 21 48

Confusion Matrix:¶

Predicted Died Predicted Survived
Actually Died 98 (True Negatives) 12 (False Positives)
Actually Survived 21 (False Negatives) 48 (True Positives)

Key Metrics and Interpretations:¶

  1. Accuracy:

    $$ \text{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN} = \frac{48 + 98}{48 + 98 + 12 + 21} = \frac{146}{179} \approx 81.56\% $$
    • The model correctly predicts the outcome about 81.56% of the time.
  2. Precision (Survived):

    $$ \text{Precision} = \frac{TP}{TP + FP} = \frac{48}{48 + 12} = \frac{48}{60} = 80\% $$
    • When the model predicts someone will survive, it's correct 80% of the time.
  3. Recall (Survived):

    $$ \text{Recall} = \frac{TP}{TP + FN} = \frac{48}{48 + 21} = \frac{48}{69} \approx 69.57\% $$
    • The model identifies about 69.57% of the actual survivors.
  4. Precision (Died):

    $$ \frac{TN}{TN + FN} = \frac{98}{98 + 21} = \frac{98}{119} \approx 82.35\% $$
  5. Recall (Died):

    $$ \frac{TN}{TN + FP} = \frac{98}{98 + 12} = \frac{98}{110} \approx 89.09\% $$

Insights:¶

  • The model is more accurate at predicting deaths than survivals, with better recall and precision on the "Died" class.
  • It struggles more with false negatives (21 people who survived but were predicted to die), which is critical in survival scenarios.
  • Balanced performance: While not perfect, both classes are reasonably well predicted, indicating the model is not heavily biased toward one outcome.

In [ ]:
# Calculate the classification report as a DataFrame for easier reading
class_report = classification_report(y_test, y_pred_test, output_dict=True)
class_report_df = pd.DataFrame(class_report).transpose()
class_report_df
# The F1 score is the harmonic mean of precision and recall.
# It is a measure of a model's accuracy that considers both the precision and the recall.
# It is particularly useful when the class distribution is imbalanced.
# f1 score formula:
# https://images.prismic.io/encord/0ef9c82f-2857-446e-918d-5f654b9d9133_Screenshot+%2849%29.png?auto=compress,format
Out[ ]:
precision recall f1-score support
0 0.823529 0.890909 0.855895 110.000000
1 0.800000 0.695652 0.744186 69.000000
accuracy 0.815642 0.815642 0.815642 0.815642
macro avg 0.811765 0.793281 0.800041 179.000000
weighted avg 0.814459 0.815642 0.812834 179.000000
Metric Class Value Interpretation
Precision 0 0.8235 82.35% of predicted non-survivors were actually non-survivors
Precision 1 0.8000 80.00% of predicted survivors were actually survivors
Recall 0 0.8909 Model identified 89.09% of actual non-survivors correctly
Recall 1 0.6957 Model identified 69.57% of actual survivors correctly
F1-Score 0 0.8559 Balanced performance for non-survivor prediction
F1-Score 1 0.7442 Balanced performance for survivor prediction
Support 0 110 Number of actual non-survivors in the test set
Support 1 69 Number of actual survivors in the test set

Overall Evaluation Metrics¶

Metric Value Interpretation
Accuracy 0.8156 Overall, 81.56% of total predictions are correct
Macro Avg F1 0.8000 Average F1-score treating both classes equally (no weighting)
Weighted Avg F1 0.8128 Average F1-score considering class imbalance (more realistic)
Macro Avg Recall 0.7933 Average recall of both classes, unweighted
Weighted Avg Precision 0.8145 Overall precision adjusted for class sizes

image.png

image.png

image.png


image.png

image.png

In [36]:
# Hard class predictions (uses the default 0.5 probability threshold)
logreg.predict(X_test)
Out[36]:
array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0])
In [ ]:
# Predicted probability of the positive class (Survived = 1) for each test row
logreg.predict_proba(X_test)[:, 1]

image.png

In [37]:
# Full probability matrix: column 0 = P(died), column 1 = P(survived);
# each row sums to 1.
logreg.predict_proba(X_test)
Out[37]:
array([[9.67908071e-01, 3.20919292e-02],
       [9.32490994e-01, 6.75090064e-02],
       [8.65873955e-01, 1.34126045e-01],
       [9.23976595e-01, 7.60234054e-02],
       [3.84356070e-02, 9.61564393e-01],
       [6.20498119e-01, 3.79501881e-01],
       [9.03077394e-01, 9.69226056e-02],
       [9.13470087e-01, 8.65299135e-02],
       [3.39534138e-01, 6.60465862e-01],
       [9.06617655e-01, 9.33823455e-02],
       [9.12827333e-01, 8.71726670e-02],
       [5.96201798e-01, 4.03798202e-01],
       [2.75296948e-01, 7.24703052e-01],
       [9.27400014e-01, 7.25999856e-02],
       [9.12827333e-01, 8.71726670e-02],
       [5.21113456e-02, 9.47888654e-01],
       [6.06922862e-01, 3.93077138e-01],
       [9.01256533e-01, 9.87434674e-02],
       [2.60674516e-01, 7.39325484e-01],
       [4.01662220e-01, 5.98337780e-01],
       [9.31276462e-01, 6.87235384e-02],
       [8.58004847e-01, 1.41995153e-01],
       [6.99629270e-01, 3.00370730e-01],
       [2.56303571e-01, 7.43696429e-01],
       [7.05838749e-01, 2.94161251e-01],
       [2.43295899e-01, 7.56704101e-01],
       [9.51929778e-01, 4.80702216e-02],
       [6.95653752e-01, 3.04346248e-01],
       [9.24910963e-01, 7.50890366e-02],
       [9.77069011e-01, 2.29309894e-02],
       [5.43012866e-01, 4.56987134e-01],
       [9.27400014e-01, 7.25999856e-02],
       [5.47293715e-02, 9.45270628e-01],
       [8.88056804e-01, 1.11943196e-01],
       [3.98296262e-02, 9.60170374e-01],
       [9.60492921e-01, 3.95070794e-02],
       [9.13035201e-01, 8.69647987e-02],
       [3.31748978e-01, 6.68251022e-01],
       [1.37269217e-01, 8.62730783e-01],
       [2.86207326e-01, 7.13792674e-01],
       [8.08235733e-01, 1.91764267e-01],
       [3.67540277e-01, 6.32459723e-01],
       [5.80394893e-01, 4.19605107e-01],
       [9.68583494e-01, 3.14165058e-02],
       [8.46766468e-01, 1.53233532e-01],
       [9.19318807e-01, 8.06811931e-02],
       [1.54176510e-01, 8.45823490e-01],
       [5.22740271e-02, 9.47725973e-01],
       [9.56720169e-01, 4.32798306e-02],
       [9.34672512e-01, 6.53274882e-02],
       [9.15628594e-01, 8.43714058e-02],
       [7.01049150e-01, 2.98950850e-01],
       [9.08562354e-01, 9.14376463e-02],
       [4.67207689e-01, 5.32792311e-01],
       [7.87947144e-01, 2.12052856e-01],
       [9.52242959e-01, 4.77570410e-02],
       [9.40814439e-01, 5.91855614e-02],
       [9.10074330e-01, 8.99256701e-02],
       [4.04961059e-02, 9.59503894e-01],
       [7.49863927e-02, 9.25013607e-01],
       [7.65781503e-01, 2.34218497e-01],
       [9.61099669e-01, 3.89003311e-02],
       [3.44666821e-01, 6.55333179e-01],
       [3.62929317e-01, 6.37070683e-01],
       [9.25473994e-01, 7.45260063e-02],
       [9.04530122e-01, 9.54698783e-02],
       [6.33993386e-01, 3.66006614e-01],
       [3.79729654e-01, 6.20270346e-01],
       [3.86022571e-01, 6.13977429e-01],
       [7.87947144e-01, 2.12052856e-01],
       [7.19645365e-01, 2.80354635e-01],
       [3.41635048e-01, 6.58364952e-01],
       [3.45296195e-01, 6.54703805e-01],
       [9.46015592e-01, 5.39844076e-02],
       [5.47787202e-01, 4.52212798e-01],
       [9.27570408e-01, 7.24295916e-02],
       [9.09837005e-01, 9.01629947e-02],
       [2.89837380e-02, 9.71016262e-01],
       [3.41646077e-01, 6.58353923e-01],
       [3.06634964e-01, 6.93365036e-01],
       [6.34962408e-01, 3.65037592e-01],
       [9.10942451e-01, 8.90575495e-02],
       [5.73434945e-01, 4.26565055e-01],
       [9.03001987e-01, 9.69980135e-02],
       [9.53287912e-01, 4.67120879e-02],
       [7.21847153e-02, 9.27815285e-01],
       [9.18875604e-01, 8.11243955e-02],
       [7.30602642e-01, 2.69397358e-01],
       [4.79612417e-02, 9.52038758e-01],
       [4.65091905e-01, 5.34908095e-01],
       [5.75160632e-01, 4.24839368e-01],
       [8.10918085e-02, 9.18908192e-01],
       [4.91193360e-02, 9.50880664e-01],
       [9.27400014e-01, 7.25999856e-02],
       [4.88153074e-01, 5.11846926e-01],
       [7.70340183e-01, 2.29659817e-01],
       [3.45746107e-01, 6.54253893e-01],
       [9.83302731e-01, 1.66972688e-02],
       [9.05865105e-01, 9.41348952e-02],
       [9.15370571e-01, 8.46294293e-02],
       [3.41601959e-01, 6.58398041e-01],
       [8.75703793e-01, 1.24296207e-01],
       [4.63724767e-01, 5.36275233e-01],
       [5.65473459e-01, 4.34526541e-01],
       [3.15970295e-01, 6.84029705e-01],
       [4.41414738e-02, 9.55858526e-01],
       [4.74809434e-01, 5.25190566e-01],
       [9.48474764e-01, 5.15252363e-02],
       [9.19554599e-01, 8.04454011e-02],
       [5.22327426e-02, 9.47767257e-01],
       [8.86226334e-01, 1.13773666e-01],
       [8.22740986e-01, 1.77259014e-01],
       [8.28536898e-01, 1.71463102e-01],
       [9.04343632e-01, 9.56563677e-02],
       [6.86586837e-01, 3.13413163e-01],
       [1.75278591e-01, 8.24721409e-01],
       [7.96402273e-01, 2.03597727e-01],
       [5.80994262e-01, 4.19005738e-01],
       [9.14990373e-01, 8.50096273e-02],
       [6.07040781e-01, 3.92959219e-01],
       [9.22394910e-01, 7.76050901e-02],
       [5.87895097e-02, 9.41210490e-01],
       [9.13082463e-01, 8.69175374e-02],
       [9.62333355e-01, 3.76666451e-02],
       [8.88346320e-01, 1.11653680e-01],
       [2.96837651e-01, 7.03162349e-01],
       [8.59619363e-01, 1.40380637e-01],
       [7.92191479e-01, 2.07808521e-01],
       [9.27916730e-01, 7.20832698e-02],
       [1.81208719e-01, 8.18791281e-01],
       [9.34433617e-01, 6.55663826e-02],
       [9.06617655e-01, 9.33823455e-02],
       [3.41521054e-01, 6.58478946e-01],
       [5.71424359e-01, 4.28575641e-01],
       [8.16337740e-01, 1.83662260e-01],
       [4.66930075e-01, 5.33069925e-01],
       [9.15628594e-01, 8.43714058e-02],
       [4.90409243e-01, 5.09590757e-01],
       [7.95418207e-01, 2.04581793e-01],
       [9.17066861e-01, 8.29331394e-02],
       [9.83302731e-01, 1.66972688e-02],
       [9.20732227e-01, 7.92677730e-02],
       [9.14236159e-01, 8.57638414e-02],
       [9.27359274e-01, 7.26407259e-02],
       [5.51438379e-02, 9.44856162e-01],
       [9.31227284e-01, 6.87727155e-02],
       [9.49415192e-01, 5.05848081e-02],
       [9.14977678e-01, 8.50223222e-02],
       [9.26647919e-01, 7.33520813e-02],
       [9.01649548e-01, 9.83504519e-02],
       [8.71488763e-01, 1.28511237e-01],
       [9.22530372e-01, 7.74696280e-02],
       [4.87769795e-02, 9.51223021e-01],
       [4.94119687e-01, 5.05880313e-01],
       [5.02006072e-02, 9.49799393e-01],
       [5.18099117e-02, 9.48190088e-01],
       [2.78587130e-01, 7.21412870e-01],
       [8.94569106e-01, 1.05430894e-01],
       [9.86025730e-01, 1.39742697e-02],
       [6.34084414e-01, 3.65915586e-01],
       [2.63656537e-01, 7.36343463e-01],
       [9.99188708e-01, 8.11292315e-04],
       [6.73831774e-02, 9.32616823e-01],
       [9.25670915e-01, 7.43290848e-02],
       [3.70851239e-01, 6.29148761e-01],
       [4.40582285e-02, 9.55941772e-01],
       [9.64006268e-01, 3.59937322e-02],
       [1.95900368e-01, 8.04099632e-01],
       [6.79658376e-01, 3.20341624e-01],
       [8.04546748e-01, 1.95453252e-01],
       [9.27623103e-01, 7.23768966e-02],
       [9.10567581e-01, 8.94324192e-02],
       [9.09257210e-01, 9.07427904e-02],
       [9.11914394e-01, 8.80856060e-02],
       [7.70981769e-01, 2.29018231e-01],
       [2.48598425e-01, 7.51401575e-01],
       [9.18947456e-01, 8.10525437e-02],
       [9.04453244e-01, 9.55467563e-02],
       [8.92305350e-01, 1.07694650e-01]])
In [ ]:
# Compute and plot the ROC curve and its AUC.
from sklearn.metrics import roc_curve, auc

# The positive-class probabilities drive the ranking behind the ROC curve.
survival_probs = logreg.predict_proba(X_test)[:, 1]
# fpr - False Positive Rate, tpr - True Positive Rate, traced over every
# candidate threshold returned by roc_curve.
fpr, tpr, thresholds = roc_curve(y_test, survival_probs)
# Area Under the Curve: single-number summary of ranking quality.
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr, tpr, color='blue', label='ROC curve (area = {:.2f})'.format(roc_auc))
ax.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random Guessing')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.set_xlabel('False Positive Rate (FPR)')
ax.set_ylabel('True Positive Rate (TPR)')
ax.legend()
ax.grid()
plt.show()
No description has been provided for this image
Metric Meaning
AUC = 0.85 This indicates that there's an 85% chance the model ranks a randomly chosen survivor higher than a randomly chosen non-survivor.
High TPR at Low FPR The model correctly identifies most survivors early on, while making few false positive errors.
General Shape The ROC curve is bowed toward the top-left, indicating good separation between classes.

Conclusion¶

Aspect Assessment
Model Discrimination Strong – able to distinguish survivors from non-survivors
Threshold Tuning Need Optional – current threshold seems reasonable, but could be optimized for better recall or precision based on use-case
Next Step Suggestion Consider precision-recall curve if the positive class (survived) is more critical to capture
In [38]:
# Setting the threshold to 0.5 (the same cutoff .predict() uses by default)
threshold = 0.5
logreg.predict_proba(X_test)[:, 1] > threshold
Out[38]:
array([False, False, False, False,  True, False, False, False,  True,
       False, False, False,  True, False, False,  True, False, False,
        True,  True, False, False, False,  True, False,  True, False,
       False, False, False, False, False,  True, False,  True, False,
       False,  True,  True,  True, False,  True, False, False, False,
       False,  True,  True, False, False, False, False, False,  True,
       False, False, False, False,  True,  True, False, False,  True,
        True, False, False, False,  True,  True, False, False,  True,
        True, False, False, False, False,  True,  True,  True, False,
       False, False, False, False,  True, False, False,  True,  True,
       False,  True,  True, False,  True, False,  True, False, False,
       False,  True, False,  True, False,  True,  True,  True, False,
       False,  True, False, False, False, False, False,  True, False,
       False, False, False, False,  True, False, False, False,  True,
       False, False, False,  True, False, False,  True, False, False,
        True, False,  True, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False,  True,
        True,  True,  True,  True, False, False, False,  True, False,
        True, False,  True,  True, False,  True, False, False, False,
       False, False, False, False,  True, False, False, False])
In [39]:
# Setting the threshold to 0.75 — stricter: fewer passengers predicted to survive
threshold = 0.75
logreg.predict_proba(X_test)[:, 1] > threshold
Out[39]:
array([False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False,  True, False,  True, False,
       False, False,  True, False, False, False, False, False, False,
       False,  True,  True, False, False, False, False, False, False,
       False, False, False, False,  True,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False,  True, False, False,  True, False,
       False,  True,  True, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False,  True, False, False, False, False, False,  True, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False,  True,
       False,  True,  True, False, False, False, False, False, False,
        True, False, False,  True, False,  True, False, False, False,
       False, False, False, False,  True, False, False, False])
In [40]:
# Setting the threshold to 0.25 — looser: more passengers predicted to survive
threshold = 0.25
logreg.predict_proba(X_test)[:, 1] > threshold
Out[40]:
array([False, False, False, False,  True,  True, False, False,  True,
       False, False,  True,  True, False, False,  True,  True, False,
        True,  True, False, False,  True,  True,  True,  True, False,
        True, False, False,  True, False,  True, False,  True, False,
       False,  True,  True,  True, False,  True,  True, False, False,
       False,  True,  True, False, False, False,  True, False,  True,
       False, False, False, False,  True,  True, False, False,  True,
        True, False, False,  True,  True,  True, False,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False, False,  True, False,  True,  True,  True,
        True,  True,  True, False,  True, False,  True, False, False,
       False,  True, False,  True,  True,  True,  True,  True, False,
       False,  True, False, False, False, False,  True,  True, False,
        True, False,  True, False,  True, False, False, False,  True,
       False, False, False,  True, False, False,  True,  True, False,
        True, False,  True, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False,  True,
        True,  True,  True,  True, False, False,  True,  True, False,
        True, False,  True,  True, False,  True,  True, False, False,
       False, False, False, False,  True, False, False, False])
In [42]:
# Setting the threshold to 0.5
threshold = 0.5
pred_05 = np.where(logreg.predict_proba(X_test)[:, 1] > threshold, 1, 0)
# Setting the threshold to 0.75
threshold = 0.75
pred_075 = np.where(logreg.predict_proba(X_test)[:, 1] > threshold, 1, 0)
# Setting the threshold to 0.25
threshold = 0.25
pred_025 = np.where(logreg.predict_proba(X_test)[:, 1] > threshold, 1, 0)

# Printing predictions for different thresholds
print("Predictions with threshold 0.5:", pred_05)
print("\n\nPredictions with threshold 0.75:", pred_075)
print("\n\nPredictions with threshold 0.25:", pred_025)
Predictions with threshold 0.5: [0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0
 1 1 1 0 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 1 1 0 0 1 1 0 0 0 1 1 0 0 1 1 0
 0 0 0 1 1 1 0 0 0 0 0 1 0 0 1 1 0 1 1 0 1 0 1 0 0 0 1 0 1 0 1 1 1 0 0 1 0
 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 1 1 1 1 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0]


Predictions with threshold 0.75: [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0
 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0
 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0]


Predictions with threshold 0.25: [0 0 0 0 1 1 0 0 1 0 0 1 1 0 0 1 1 0 1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 1 0 0
 1 1 1 0 1 1 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 0 1 1 0 0 1 1 1 0 1 1 1 0
 1 0 0 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 1 0 1 0 1 0 0 0 1 0 1 1 1 1 1 0 0 1 0
 0 0 0 1 1 0 1 0 1 0 1 0 0 0 1 0 0 0 1 0 0 1 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 1 1 1 1 1 0 0 1 1 0 1 0 1 1 0 1 1 0 0 0 0 0 0 1 0 0 0]
In [51]:
# Build a per-passenger frame of actual label, predicted probability, and the
# default-threshold (0.5) predicted class — the basis for threshold tuning.
# FIX: removed the misleading `numbers = [float(x)/20 ...]` list — it was
# printed but never used, and it contradicted the actual sweep in the next
# cell, which uses steps of 0.1, not 0.05. Also compute predict_proba once
# instead of twice.
survival_probs = logreg.predict_proba(X_test)[:, 1]
cutoff_df = pd.DataFrame(zip(y_test, survival_probs, np.where(survival_probs > 0.5, 1, 0)),
                         columns=['Actual', 'Predicted_Prob', 'Predicted_Class'])
cutoff_df
Thresholds: [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
Out[51]:
Actual Predicted_Prob Predicted_Class
0 0 0.032092 0
1 0 0.067509 0
2 0 0.134126 0
3 0 0.076023 0
4 1 0.961564 1
... ... ... ...
174 0 0.229018 0
175 1 0.751402 1
176 0 0.081053 0
177 0 0.095547 0
178 0 0.107695 0

179 rows × 3 columns

In [53]:
# Lets try the model performance with different thresholds
numbers = [float(x)/10 for x in range(0, 11)]
print("Thresholds:", numbers)

# Add one 0/1 column per threshold: 1 when the predicted probability
# exceeds that threshold, 0 otherwise.
# NOTE(review): the saved output shows extra columns (0.05, 0.15, ...) left
# over from an earlier run with different thresholds — hidden kernel state.
# Restart the kernel and run all cells top-to-bottom to clear it.
for threshold in numbers:
    cutoff_df[threshold] = cutoff_df.Predicted_Prob.map(lambda x: 1 if x > threshold else 0)

cutoff_df.head()
Thresholds: [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
Out[53]:
Actual Predicted_Prob Predicted_Class 0.0 0.05 0.1 0.15 0.2 0.25 0.3 0.35 0.4 0.45 0.5 0.6 0.7 0.8 0.9 1.0
0 0 0.032092 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 0 0.067509 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 0 0.134126 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0.076023 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 1 0.961564 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
In [54]:
from sklearn.metrics import confusion_matrix

# confusion_matrix layout for binary labels:
#   cm[0,0] = TN (true negative)   cm[0,1] = FP (false positive)
#   cm[1,0] = FN (false negative)  cm[1,1] = TP (true positive)
num = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Accuracy / sensitivity / specificity at each candidate threshold.
# Collect rows in a list and build the DataFrame once — growing an empty
# DataFrame row-by-row with .loc leaves the columns as object dtype and
# is the slow anti-pattern.
rows = []
for threshold in num:
    cm1 = confusion_matrix(cutoff_df.Actual, cutoff_df[threshold])
    total1 = cm1.sum()
    accuracy = (cm1[0, 0] + cm1[1, 1]) / total1
    specificity = cm1[0, 0] / (cm1[0, 0] + cm1[0, 1])  # TN / (TN + FP)
    sensitivity = cm1[1, 1] / (cm1[1, 1] + cm1[1, 0])  # TP / (TP + FN), i.e. recall
    rows.append([threshold, accuracy, sensitivity, specificity])

# Index by threshold to match the earlier .loc[threshold] layout
cutoff_df1 = pd.DataFrame(rows, columns=['prob', 'accuracy', 'sensi', 'speci'], index=num)
cutoff_df1
Out[54]:
prob accuracy sensi speci
0.0 0.0 0.385475 1.000000 0.000000
0.1 0.1 0.675978 0.855072 0.563636
0.2 0.2 0.731844 0.811594 0.681818
0.3 0.3 0.787709 0.797101 0.781818
0.4 0.4 0.787709 0.724638 0.827273
0.5 0.5 0.815642 0.695652 0.890909
0.6 0.6 0.798883 0.608696 0.918182
0.7 0.7 0.765363 0.449275 0.963636
0.8 0.8 0.748603 0.362319 0.990909
0.9 0.9 0.720670 0.289855 0.990909
1.0 1.0 0.614525 0.000000 1.000000
In [55]:
# Visualize how accuracy, sensitivity and specificity trade off as the
# classification cutoff moves from 0 to 1 (their crossing suggests a cutoff).
fig, ax = plt.subplots(figsize=(12, 6))
for col, label in [('accuracy', 'Accuracy'),
                   ('sensi', 'Sensitivity'),
                   ('speci', 'Specificity')]:
    ax.plot(cutoff_df1['prob'], cutoff_df1[col], marker='o', label=label)
ax.set_title('Model Performance Metrics vs Threshold')
ax.set_xlabel('Threshold')
ax.set_ylabel('Metric Value')
ax.legend()
ax.grid()
plt.show()
No description has been provided for this image

image.png

In [64]:
# Per the graph above, sensitivity and specificity cross near 0.33,
# so use that as the classification cutoff for the final predictions.
threshold = 0.33
final_predictions = (logreg.predict_proba(X_test)[:, 1] > threshold).astype(int)
final_predictions
Out[64]:
array([0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0])
In [65]:
# Test-set accuracy at the 0.33 cutoff.
# NOTE(review): relies on accuracy_score being imported in an earlier cell — confirm.
accuracy_score(y_test, final_predictions)
Out[65]:
0.7988826815642458
In [74]:
# My Style of Coding:
# Sweep the cutoff from 0.00 to 1.00 in steps of 0.05 and report the test-set
# accuracy at each value. A single loop replaces the 21 copy-pasted
# "final_predictions_thrXX" / "accuracy_thrXX" cells — same printed output,
# far less repetition, and trivially extensible to other step sizes.
from sklearn.metrics import accuracy_score

thresholds = [round(0.05 * i, 2) for i in range(21)]  # 0.0, 0.05, ..., 1.0

# Probabilities only need to be computed once for the whole sweep
pred_prob = logreg.predict_proba(X_test)[:, 1]

for thr in thresholds:
    preds = np.where(pred_prob > thr, 1, 0)
    acc = accuracy_score(y_test, preds)
    print(f"Accuracy for threshold {thr:.2f}: {acc}")
Accuracy for threshold 0.00: 0.3854748603351955
Accuracy for threshold 0.05: 0.4692737430167598
Accuracy for threshold 0.10: 0.6759776536312849
Accuracy for threshold 0.15: 0.7094972067039106
Accuracy for threshold 0.20: 0.7318435754189944
Accuracy for threshold 0.25: 0.776536312849162
Accuracy for threshold 0.30: 0.7877094972067039
Accuracy for threshold 0.35: 0.7988826815642458
Accuracy for threshold 0.40: 0.7877094972067039
Accuracy for threshold 0.45: 0.8044692737430168
Accuracy for threshold 0.50: 0.8156424581005587
Accuracy for threshold 0.55: 0.8044692737430168
Accuracy for threshold 0.60: 0.7988826815642458
Accuracy for threshold 0.65: 0.7821229050279329
Accuracy for threshold 0.70: 0.7653631284916201
Accuracy for threshold 0.75: 0.7597765363128491
Accuracy for threshold 0.80: 0.7486033519553073
Accuracy for threshold 0.85: 0.7262569832402235
Accuracy for threshold 0.90: 0.7206703910614525
Accuracy for threshold 0.95: 0.664804469273743
Accuracy for threshold 1.00: 0.6145251396648045
In [75]:
# My Style of Coding:
# Fine-grained sweep (step 0.01) around the best coarse threshold region
# 0.45–0.55. A loop replaces the 11 copy-pasted prediction/accuracy cells;
# the printed output is identical.
threshold = [round(0.45 + 0.01 * i, 2) for i in range(11)]  # 0.45 ... 0.55

# Reuse a single predict_proba call for every cutoff
pred_prob = logreg.predict_proba(X_test)[:, 1]

for thr in threshold:
    preds = np.where(pred_prob > thr, 1, 0)
    print(f"Accuracy for threshold {thr:.2f}: {accuracy_score(y_test, preds)}")
Accuracy for threshold 0.45: 0.8044692737430168
Accuracy for threshold 0.46: 0.8156424581005587
Accuracy for threshold 0.47: 0.8156424581005587
Accuracy for threshold 0.48: 0.8156424581005587
Accuracy for threshold 0.49: 0.8156424581005587
Accuracy for threshold 0.50: 0.8156424581005587
Accuracy for threshold 0.51: 0.8044692737430168
Accuracy for threshold 0.52: 0.8100558659217877
Accuracy for threshold 0.53: 0.8156424581005587
Accuracy for threshold 0.54: 0.8044692737430168
Accuracy for threshold 0.55: 0.8044692737430168
In [76]:
# The best accuracy we obtained is 0.8156424581005587, at a threshold of 0.5 (a few nearby thresholds tie with it)
# So we can say that the model performs well, with an accuracy of 81.56% on the test set at a threshold of 0.5
In [ ]:
 

class sklearn.linear_model.LogisticRegression(penalty='l2', *, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='deprecated', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)¶


1. penalty (default='l2')¶

  • What it does: Specifies the regularization technique used to avoid overfitting.

  • Options:

    • 'l1': Lasso regularization (can zero out coefficients).
    • 'l2': Ridge regularization (shrinks coefficients but doesn't zero them).
    • 'elasticnet': Mix of both (l1_ratio controls the balance).
    • 'none': No regularization. (Note: recent scikit-learn versions expect penalty=None instead of the string 'none'.)
  • Common Use: 'l2' is most common.


2. dual (default=False)¶

  • What it does: Chooses the dual formulation of the optimization problem.
  • When to use: Set to True only when using liblinear solver and when the number of samples < number of features.
  • Common Use: Usually False.

3. tol (default=0.0001)¶

  • What it does: Tolerance for stopping criteria. Smaller values make the model more precise but slower to train.
  • Common Use: Default is fine in most cases.

4. C (default=1.0)¶

  • What it does: Inverse of regularization strength. Smaller values mean stronger regularization.
  • Tip: It’s the most important tuning parameter.
  • Common Range: 0.01 to 100. Default is 1.0.

5. fit_intercept (default=True)¶

  • What it does: Adds an intercept (bias) term to the model.
  • Common Use: Keep it True, unless data is already centered.

6. intercept_scaling (default=1)¶

  • What it does: Only used when solver='liblinear' and fit_intercept=True. Scales the intercept term.
  • Common Use: Rarely changed.

7. class_weight (default=None)¶

  • What it does: Adjusts weights for classes to handle imbalanced data.
  • Common Values: 'balanced' or custom dictionary (e.g., {0:1, 1:3}).
  • Common Use: Very useful in fraud, medical, or other imbalanced datasets.

8. random_state (default=None)¶

  • What it does: Seed for random number generator. Helps ensure reproducibility.
  • Common Use: Set to an integer like 42.

9. solver (default='lbfgs')¶

  • What it does: Optimization algorithm used to find the best coefficients.

  • Options:

    • 'liblinear': Good for small datasets and supports 'l1'.
    • 'lbfgs': Fast and works well for most cases.
    • 'saga': Supports l1, l2, and elasticnet, good for large datasets.
  • Common Use: 'lbfgs' or 'saga'.


10. max_iter (default=100)¶

  • What it does: Maximum number of iterations for the solver to converge.
  • Tip: Increase if model doesn’t converge (e.g., set to 500).
  • Common Use: Default is fine unless you get a convergence warning.

11. multi_class (default='deprecated')¶

  • What it does: Determines how to handle multiple classes.
  • Current Behavior: Automatically chosen based on solver.
  • Common Values: 'ovr' (one-vs-rest), 'multinomial' (better for multiclass).
  • Note: Explicitly set 'multinomial' when using 'lbfgs' or 'saga' with multiclass problems.

12. verbose (default=0)¶

  • What it does: Controls how much output is shown during training.
  • Common Use: Set to a positive integer if you want to debug.

13. warm_start (default=False)¶

  • What it does: Reuses solution from previous fit to speed up next one.
  • Common Use: Rarely used unless doing iterative training.

14. n_jobs (default=None)¶

  • What it does: Number of CPU cores used. Set to -1 to use all available.
  • Common Use: Helps speed up training in large datasets.

15. l1_ratio (default=None)¶

  • What it does: Only used when penalty='elasticnet'. It balances l1 and l2 regularization.
  • Common Use: Needs tuning if elasticnet is selected. Value between 0 and 1.

Most Important and Commonly Tuned Parameters:¶

Parameter Why Important Common Settings
C Controls regularization strength 0.01, 0.1, 1, 10
penalty Regularization method 'l2', or 'l1' if sparse
solver Impacts speed and regularization options 'lbfgs', 'liblinear', 'saga'
class_weight Handles imbalance None or 'balanced'
max_iter Needed for convergence in some cases 100, 200, 500

In [77]:
# Signature reference:
# LogisticRegression(penalty='l2', C=1.0, solver='lbfgs', max_iter=100, ...)

from sklearn.linear_model import LogisticRegression

# Refit with C=0.5 — C is the INVERSE of regularization strength,
# so a smaller C means a stronger penalty than the default.
logreg2 = LogisticRegression(C=0.5)
logreg2.fit(X_train, y_train)

# Score the regularized model on both splits
y_pred_train2 = logreg2.predict(X_train)
y_pred_test2 = logreg2.predict(X_test)

train_acc2 = accuracy_score(y_train, y_pred_train2)
test_acc2 = accuracy_score(y_test, y_pred_test2)
print("Training Accuracy with C=0.5:", train_acc2)
print("Testing Accuracy with C=0.5:", test_acc2)
Training Accuracy with C=0.5: 0.8314606741573034
Testing Accuracy with C=0.5: 0.8100558659217877
In [78]:
from sklearn.linear_model import LogisticRegression

# Refit with C=1.5 — larger C, i.e. WEAKER regularization than the default
logreg3 = LogisticRegression(C=1.5)
logreg3.fit(X_train, y_train)

# Score the weaker-regularized model on both splits
y_pred_train3 = logreg3.predict(X_train)
y_pred_test3 = logreg3.predict(X_test)

train_acc3 = accuracy_score(y_train, y_pred_train3)
test_acc3 = accuracy_score(y_test, y_pred_test3)
print("Training Accuracy with C=1.5:", train_acc3)
print("Testing Accuracy with C=1.5:", test_acc3)
Training Accuracy with C=1.5: 0.8328651685393258
Testing Accuracy with C=1.5: 0.8100558659217877
In [79]:
# Try an L1-penalized (sparse) model: strong regularization (C=0.1) with
# the 'liblinear' solver, which supports the 'l1' penalty.
logreg4 = LogisticRegression(C=0.1, penalty='l1', solver='liblinear')
logreg4.fit(X_train, y_train)

# Score the L1 model on both splits
y_pred_train4 = logreg4.predict(X_train)
y_pred_test4 = logreg4.predict(X_test)

train_acc4 = accuracy_score(y_train, y_pred_train4)
test_acc4 = accuracy_score(y_test, y_pred_test4)
print("Training Accuracy with C=0.1, penalty='l1':", train_acc4)
print("Testing Accuracy with C=0.1, penalty='l1':", test_acc4)
Training Accuracy with C=0.1, penalty='l1': 0.8061797752808989
Testing Accuracy with C=0.1, penalty='l1': 0.7821229050279329
In [80]:
# Note:
# What all can we do to improve the model performance?
# 1. Feature Engineering: Create new features or modify existing ones to better capture the relationships in the data.
# 2. Hyperparameter Tuning: Experiment with different values for hyperparameters like C, penalty, and solver to find the best combination for your model.
# 3. Change Train-Test Split Ratio: Adjust the ratio of training to testing data to see if it affects model performance.
# 4. Change the random_state in train_test_split to see if it affects model performance.
# 5. Try different scaling techniques like Min-Max Scaling or Robust Scaling.
# 6. Try different Machine Learning algorithms like Decision Trees, Random Forests, or Support Vector Machines to see if they perform better on the dataset.(later)
In [81]:
# Task to do now:
# https://www.kaggle.com/code/neisha/heart-disease-prediction-using-logistic-regression

Multiclass¶

In [82]:
# Logistic Regression for Multiclass Classification
# For multiclass classification, we can use the One-vs-Rest (OvR)
# Dataset: Iris dataset of Seaborn library (150 rows, 3 balanced species)

import seaborn as sns  # NOTE(review): seaborn is already imported at the top of the notebook
iris = sns.load_dataset('iris')
iris.head()
Out[82]:
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
In [83]:
# Separate the predictors (X) from the target label (y)
y = iris['species']
X = iris.drop(columns=['species'])
In [84]:
# Hold out 20% for testing; stratify on y so every species keeps its
# proportion in both splits.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

# Sanity-check the split sizes (120 train / 30 test for iris)
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
X_train shape: (120, 4), y_train shape: (120,)
X_test shape: (30, 4), y_test shape: (30,)
In [86]:
# Standardize the four numeric features.
# (The original cell began with a bare `iris.columns` expression — dead code,
# since only a cell's LAST expression is displayed; removed.)
features_to_be_scaled = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the scaler on the training split only, then apply the SAME transform
# to the test split — fitting on test data would leak information.
X_train[features_to_be_scaled] = sc.fit_transform(X_train[features_to_be_scaled])
X_test[features_to_be_scaled] = sc.transform(X_test[features_to_be_scaled])

image.png

image.png

In [87]:
# Modeling - apply the Logistic Regression algorithm for multiclass classification
from sklearn.linear_model import LogisticRegression

# multi_class='ovr' trains one binary classifier per class (One-vs-Rest).
# NOTE: this parameter is optional — LogisticRegression handles multiclass
# targets automatically — and it is deprecated in recent scikit-learn versions.
logreg_multi = LogisticRegression(multi_class='ovr', random_state=0)
logreg_multi.fit(X_train, y_train)

# Predicting on training and test sets
y_pred_train_multi = logreg_multi.predict(X_train)
y_pred_test_multi = logreg_multi.predict(X_test)

# Evaluating the model performance
from sklearn.metrics import accuracy_score, confusion_matrix
train_acc_multi = accuracy_score(y_train, y_pred_train_multi)
test_acc_multi = accuracy_score(y_test, y_pred_test_multi)
print(f"Training Accuracy for Multiclass: {train_acc_multi}")
print(f"Testing Accuracy for Multiclass: {test_acc_multi}")

# Label the confusion matrix with the model's own (sorted) class order.
# confusion_matrix sorts labels internally, which need not match
# iris['species'].unique() order — using classes_ keeps axes correct.
confusion_mat_multi = confusion_matrix(y_test, y_pred_test_multi)
cm_df_multi = pd.DataFrame(confusion_mat_multi,
                           index=logreg_multi.classes_,
                           columns=logreg_multi.classes_)
cm_df_multi
Training Accuracy for Multiclass: 0.925
Testing Accuracy for Multiclass: 0.9
Out[87]:
setosa versicolor virginica
setosa 10 0 0
versicolor 0 8 2
virginica 0 1 9
In [95]:
# Predict the class label for the first test row
# (.values.reshape(1, -1) turns the 1-D row into the 2-D array predict expects)
logreg_multi.predict(X_test.iloc[0, :].values.reshape(1, -1))
Out[95]:
array(['setosa'], dtype=object)
In [96]:
# Per-class probabilities for the first test row (one column per species)
logreg_multi.predict_proba(X_test.iloc[0, :].values.reshape(1, -1))
Out[96]:
array([[9.27179095e-01, 7.27948768e-02, 2.60285132e-05]])
In [97]:
# The class probabilities for each sample always sum to 1
np.sum(logreg_multi.predict_proba(X_test.iloc[0, :].values.reshape(1, -1)), axis=1)
Out[97]:
array([1.])
In [99]:
# Lets try one more
# Predict the class label for test row 10
logreg_multi.predict(X_test.iloc[10, :].values.reshape(1, -1))
Out[99]:
array(['virginica'], dtype=object)
In [101]:
# Per-class probabilities for test row 10
logreg_multi.predict_proba(X_test.iloc[10, :].values.reshape(1, -1))
Out[101]:
array([[0.00182399, 0.10009919, 0.89807683]])
In [102]:
# Again, the probabilities sum to 1
np.sum(logreg_multi.predict_proba(X_test.iloc[10, :].values.reshape(1, -1)))
Out[102]:
1.0

Happy Learning¶